In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
data = pd.read_excel("sales_data.xlsx")
In [3]:
data.head()
Out[3]:
CustomerID TOTAL_ORDERS REVENUE AVERAGE_ORDER_VALUE CARRIAGE_REVENUE AVERAGESHIPPING FIRST_ORDER_DATE LATEST_ORDER_DATE AVGDAYSBETWEENORDERS DAYSSINCELASTORDER ... WEEK3_DAY16_DAY23_REVENUE WEEK4_DAY24_DAY31_REVENUE TIME_0000_0600_ORDERS TIME_0601_1200_ORDERS TIME_1200_1800_ORDERS TIME_1801_2359_ORDERS TIME_0000_0600_REVENUE TIME_0601_1200_REVENUE TIME_1200_1800_REVENUE TIME_1801_2359_REVENUE
0 22 124 11986.54 96.67 529.59 4.27 2016-12-30 2021-10-24 14.19 1 ... 2592.18 2409.71 12 61 37 14 1067.92 5674.65 3916.53 1327.44
1 29 82 11025.96 134.46 97.92 1.19 2018-03-31 2021-10-24 15.89 1 ... 2807.66 4106.19 1 13 24 44 30.25 3143.69 2205.47 5646.55
2 83 43 7259.69 168.83 171.69 3.99 2017-11-30 2021-10-24 33.12 1 ... 713.94 2861.90 0 6 12 25 0.00 707.98 2560.91 3990.80
3 95 44 6992.27 158.92 92.82 2.11 2019-04-09 2021-10-24 21.11 1 ... 997.02 1260.08 0 10 13 21 0.00 1908.99 2025.05 3058.23
4 124 55 6263.44 113.88 179.04 3.26 2020-10-23 2021-10-24 6.65 1 ... 2725.66 768.46 6 9 27 13 501.34 1197.83 3004.77 1559.50

5 rows × 40 columns

Data Desciption

In [4]:
data.describe()
Out[4]:
CustomerID TOTAL_ORDERS REVENUE AVERAGE_ORDER_VALUE CARRIAGE_REVENUE AVERAGESHIPPING AVGDAYSBETWEENORDERS DAYSSINCELASTORDER MONDAY_ORDERS TUESDAY_ORDERS ... WEEK3_DAY16_DAY23_REVENUE WEEK4_DAY24_DAY31_REVENUE TIME_0000_0600_ORDERS TIME_0601_1200_ORDERS TIME_1200_1800_ORDERS TIME_1801_2359_ORDERS TIME_0000_0600_REVENUE TIME_0601_1200_REVENUE TIME_1200_1800_REVENUE TIME_1801_2359_REVENUE
count 5000.000000 5000.00000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.00000 ... 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000 5000.000000
mean 40709.227800 12.87040 1681.523840 136.537378 46.036376 3.592574 163.159618 87.420000 1.629000 1.75440 ... 421.826908 474.462852 1.028800 3.746200 4.434000 3.661400 131.062636 486.863868 584.731626 478.865710
std 49949.848017 12.67988 1998.618678 91.651569 47.879226 2.021360 259.699496 80.156513 2.236506 2.43394 ... 643.449120 617.579321 2.174331 4.700234 5.044793 4.581894 331.289349 789.029911 804.290026 743.244248
min 1.000000 1.00000 38.500000 10.680000 0.000000 0.000000 0.000000 1.000000 0.000000 0.00000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1687.500000 3.00000 315.097500 83.025000 9.980000 2.500000 21.670000 7.000000 0.000000 0.00000 ... 0.000000 80.000000 0.000000 1.000000 1.000000 1.000000 0.000000 35.000000 89.990000 1.000000
50% 13765.000000 8.00000 966.725000 113.160000 24.985000 3.660000 57.635000 68.000000 1.000000 1.00000 ... 194.990000 264.485000 0.000000 2.000000 3.000000 2.000000 0.000000 236.440000 299.840000 208.990000
75% 71891.500000 20.00000 2493.072500 160.272500 76.862500 4.790000 170.357500 171.250000 2.000000 3.00000 ... 604.085000 651.487500 1.000000 6.000000 7.000000 5.000000 131.282500 661.952500 797.032500 620.117500
max 277160.000000 156.00000 34847.400000 1578.880000 529.590000 35.990000 1409.500000 207.000000 19.000000 23.00000 ... 12946.220000 8274.970000 34.000000 61.000000 58.000000 64.000000 7700.240000 23832.770000 17749.010000 12995.020000

8 rows × 38 columns

The describe() method returns description of feature and their interrealtions-

count - The number of not-empty values in that column.

mean - The average (mean) value of that column.

std - The standard deviation of that column.

min - the minimum value in that column.

25% - 25% percentile of column.

50% - 50% percentile of column.

75% - 75% percentile of column.

max - the maximum value in that column.

In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   CustomerID                 5000 non-null   int64         
 1   TOTAL_ORDERS               5000 non-null   int64         
 2   REVENUE                    5000 non-null   float64       
 3   AVERAGE_ORDER_VALUE        5000 non-null   float64       
 4   CARRIAGE_REVENUE           5000 non-null   float64       
 5   AVERAGESHIPPING            5000 non-null   float64       
 6   FIRST_ORDER_DATE           5000 non-null   datetime64[ns]
 7   LATEST_ORDER_DATE          5000 non-null   datetime64[ns]
 8   AVGDAYSBETWEENORDERS       5000 non-null   float64       
 9   DAYSSINCELASTORDER         5000 non-null   int64         
 10  MONDAY_ORDERS              5000 non-null   int64         
 11  TUESDAY_ORDERS             5000 non-null   int64         
 12  WEDNESDAY_ORDERS           5000 non-null   int64         
 13  THURSDAY_ORDERS            5000 non-null   int64         
 14  FRIDAY_ORDERS              5000 non-null   int64         
 15  SATURDAY_ORDERS            5000 non-null   int64         
 16  SUNDAY_ORDERS              5000 non-null   int64         
 17  MONDAY_REVENUE             5000 non-null   float64       
 18  TUESDAY_REVENUE            5000 non-null   float64       
 19  WEDNESDAY_REVENUE          5000 non-null   float64       
 20  THURSDAY_REVENUE           5000 non-null   float64       
 21  FRIDAY_REVENUE             5000 non-null   float64       
 22  SATURDAY_REVENUE           5000 non-null   float64       
 23  SUNDAY_REVENUE             5000 non-null   float64       
 24  WEEK1_DAY01_DAY07_ORDERS   5000 non-null   int64         
 25  WEEK2_DAY08_DAY15_ORDERS   5000 non-null   int64         
 26  WEEK3_DAY16_DAY23_ORDERS   5000 non-null   int64         
 27  WEEK4_DAY24_DAY31_ORDERS   5000 non-null   int64         
 28  WEEK1_DAY01_DAY07_REVENUE  5000 non-null   float64       
 29  WEEK2_DAY08_DAY15_REVENUE  5000 non-null   float64       
 30  WEEK3_DAY16_DAY23_REVENUE  5000 non-null   float64       
 31  WEEK4_DAY24_DAY31_REVENUE  5000 non-null   float64       
 32  TIME_0000_0600_ORDERS      5000 non-null   int64         
 33  TIME_0601_1200_ORDERS      5000 non-null   int64         
 34  TIME_1200_1800_ORDERS      5000 non-null   int64         
 35  TIME_1801_2359_ORDERS      5000 non-null   int64         
 36  TIME_0000_0600_REVENUE     5000 non-null   float64       
 37  TIME_0601_1200_REVENUE     5000 non-null   float64       
 38  TIME_1200_1800_REVENUE     5000 non-null   float64       
 39  TIME_1801_2359_REVENUE     5000 non-null   float64       
dtypes: datetime64[ns](2), float64(20), int64(18)
memory usage: 1.5 MB
In [6]:
data.isnull().sum()
Out[6]:
CustomerID                   0
TOTAL_ORDERS                 0
REVENUE                      0
AVERAGE_ORDER_VALUE          0
CARRIAGE_REVENUE             0
AVERAGESHIPPING              0
FIRST_ORDER_DATE             0
LATEST_ORDER_DATE            0
AVGDAYSBETWEENORDERS         0
DAYSSINCELASTORDER           0
MONDAY_ORDERS                0
TUESDAY_ORDERS               0
WEDNESDAY_ORDERS             0
THURSDAY_ORDERS              0
FRIDAY_ORDERS                0
SATURDAY_ORDERS              0
SUNDAY_ORDERS                0
MONDAY_REVENUE               0
TUESDAY_REVENUE              0
WEDNESDAY_REVENUE            0
THURSDAY_REVENUE             0
FRIDAY_REVENUE               0
SATURDAY_REVENUE             0
SUNDAY_REVENUE               0
WEEK1_DAY01_DAY07_ORDERS     0
WEEK2_DAY08_DAY15_ORDERS     0
WEEK3_DAY16_DAY23_ORDERS     0
WEEK4_DAY24_DAY31_ORDERS     0
WEEK1_DAY01_DAY07_REVENUE    0
WEEK2_DAY08_DAY15_REVENUE    0
WEEK3_DAY16_DAY23_REVENUE    0
WEEK4_DAY24_DAY31_REVENUE    0
TIME_0000_0600_ORDERS        0
TIME_0601_1200_ORDERS        0
TIME_1200_1800_ORDERS        0
TIME_1801_2359_ORDERS        0
TIME_0000_0600_REVENUE       0
TIME_0601_1200_REVENUE       0
TIME_1200_1800_REVENUE       0
TIME_1801_2359_REVENUE       0
dtype: int64
  • Data is already formatted in correct order
  • Data does not contain the Null value
  • Data is already cleaned

Exploratory Data Analysis

Number of unique value in each column

In [7]:
for x in data.columns:
    print(x,'-',len(data[x].unique()))
CustomerID - 5000
TOTAL_ORDERS - 85
REVENUE - 4635
AVERAGE_ORDER_VALUE - 4113
CARRIAGE_REVENUE - 1175
AVERAGESHIPPING - 567
FIRST_ORDER_DATE - 1928
LATEST_ORDER_DATE - 207
AVGDAYSBETWEENORDERS - 3563
DAYSSINCELASTORDER - 207
MONDAY_ORDERS - 19
TUESDAY_ORDERS - 22
WEDNESDAY_ORDERS - 22
THURSDAY_ORDERS - 23
FRIDAY_ORDERS - 24
SATURDAY_ORDERS - 24
SUNDAY_ORDERS - 23
MONDAY_REVENUE - 2151
TUESDAY_REVENUE - 2201
WEDNESDAY_REVENUE - 2262
THURSDAY_REVENUE - 2582
FRIDAY_REVENUE - 2378
SATURDAY_REVENUE - 2149
SUNDAY_REVENUE - 2573
WEEK1_DAY01_DAY07_ORDERS - 27
WEEK2_DAY08_DAY15_ORDERS - 34
WEEK3_DAY16_DAY23_ORDERS - 33
WEEK4_DAY24_DAY31_ORDERS - 32
WEEK1_DAY01_DAY07_REVENUE - 3026
WEEK2_DAY08_DAY15_REVENUE - 2900
WEEK3_DAY16_DAY23_REVENUE - 2947
WEEK4_DAY24_DAY31_REVENUE - 3386
TIME_0000_0600_ORDERS - 22
TIME_0601_1200_ORDERS - 41
TIME_1200_1800_ORDERS - 41
TIME_1801_2359_ORDERS - 36
TIME_0000_0600_REVENUE - 1413
TIME_0601_1200_REVENUE - 3203
TIME_1200_1800_REVENUE - 3512
TIME_1801_2359_REVENUE - 3128

Correlation between each columns

In [8]:
data.corr()
Out[8]:
CustomerID TOTAL_ORDERS REVENUE AVERAGE_ORDER_VALUE CARRIAGE_REVENUE AVERAGESHIPPING AVGDAYSBETWEENORDERS DAYSSINCELASTORDER MONDAY_ORDERS TUESDAY_ORDERS ... WEEK3_DAY16_DAY23_REVENUE WEEK4_DAY24_DAY31_REVENUE TIME_0000_0600_ORDERS TIME_0601_1200_ORDERS TIME_1200_1800_ORDERS TIME_1801_2359_ORDERS TIME_0000_0600_REVENUE TIME_0601_1200_REVENUE TIME_1200_1800_REVENUE TIME_1801_2359_REVENUE
CustomerID 1.000000 -0.608092 -0.556489 -0.226023 -0.576043 0.025649 0.279182 0.375343 -0.470777 -0.473127 ... -0.463766 -0.491788 -0.294194 -0.479409 -0.525385 -0.472961 -0.265095 -0.408657 -0.477957 -0.427217
TOTAL_ORDERS -0.608092 1.000000 0.771996 -0.065208 0.875733 -0.007855 -0.311838 -0.256272 0.741049 0.766660 ... 0.644590 0.663521 0.498192 0.804704 0.861575 0.756866 0.380336 0.593952 0.656071 0.565907
REVENUE -0.556489 0.771996 1.000000 0.366302 0.655447 -0.048932 -0.257746 -0.197782 0.590764 0.624238 ... 0.857369 0.841502 0.387685 0.606809 0.664253 0.598595 0.474491 0.769682 0.848650 0.742100
AVERAGE_ORDER_VALUE -0.226023 -0.065208 0.366302 1.000000 -0.100842 -0.076141 -0.088756 -0.061769 -0.036218 -0.030172 ... 0.305416 0.336919 -0.035982 -0.064526 -0.052744 -0.039114 0.156969 0.260563 0.320375 0.291733
CARRIAGE_REVENUE -0.576043 0.875733 0.655447 -0.100842 1.000000 0.277900 -0.312772 -0.231617 0.651003 0.665464 ... 0.552647 0.549227 0.464812 0.712443 0.765135 0.629639 0.346697 0.502783 0.571861 0.455406
AVERAGESHIPPING 0.025649 -0.007855 -0.048932 -0.076141 0.277900 1.000000 -0.084904 0.104563 -0.000875 -0.001364 ... -0.036116 -0.065299 0.028585 0.004779 -0.000934 -0.039176 0.009738 -0.031486 -0.031684 -0.068210
AVGDAYSBETWEENORDERS 0.279182 -0.311838 -0.257746 -0.088756 -0.312772 -0.084904 1.000000 0.054743 -0.216227 -0.222975 ... -0.197848 -0.236006 -0.168915 -0.251694 -0.268035 -0.229511 -0.138387 -0.194264 -0.221773 -0.185189
DAYSSINCELASTORDER 0.375343 -0.256272 -0.197782 -0.061769 -0.231617 0.104563 0.054743 1.000000 -0.199247 -0.199338 ... -0.182017 -0.209898 -0.123382 -0.214157 -0.226500 -0.181582 -0.089109 -0.152151 -0.177786 -0.138213
MONDAY_ORDERS -0.470777 0.741049 0.590764 -0.036218 0.651003 -0.000875 -0.216227 -0.199247 1.000000 0.519452 ... 0.491319 0.503856 0.356378 0.597853 0.638433 0.565423 0.281904 0.441408 0.500867 0.452333
TUESDAY_ORDERS -0.473127 0.766660 0.624238 -0.030172 0.665464 -0.001364 -0.222975 -0.199338 0.519452 1.000000 ... 0.529672 0.524633 0.348824 0.621192 0.662220 0.589753 0.282089 0.488838 0.540816 0.448681
WEDNESDAY_ORDERS -0.471577 0.778020 0.603069 -0.045912 0.682724 -0.003446 -0.212447 -0.198890 0.529802 0.569239 ... 0.516196 0.518829 0.416920 0.625160 0.664645 0.582135 0.306839 0.468631 0.516384 0.428615
THURSDAY_ORDERS -0.370335 0.726948 0.558130 -0.084160 0.659102 0.011517 -0.273030 -0.024351 0.469841 0.492636 ... 0.462770 0.463650 0.408840 0.595168 0.611846 0.533534 0.316490 0.444646 0.463404 0.386265
FRIDAY_ORDERS -0.483667 0.769469 0.574375 -0.064862 0.696734 0.019625 -0.232092 -0.206935 0.488109 0.516824 ... 0.471130 0.496101 0.402790 0.645718 0.669645 0.538582 0.304349 0.455051 0.493814 0.391406
SATURDAY_ORDERS -0.449325 0.740361 0.543492 -0.052176 0.628805 -0.014055 -0.222225 -0.204895 0.448159 0.461035 ... 0.444974 0.475871 0.347451 0.581930 0.646396 0.575323 0.254627 0.416428 0.461141 0.406882
SUNDAY_ORDERS -0.451761 0.686662 0.530905 -0.022297 0.574605 -0.056967 -0.246285 -0.309910 0.448086 0.434558 ... 0.445294 0.477119 0.308657 0.520976 0.594665 0.564613 0.231941 0.377689 0.443260 0.443622
MONDAY_REVENUE -0.380556 0.531736 0.724841 0.243571 0.461406 -0.030779 -0.161764 -0.131780 0.729899 0.394304 ... 0.635222 0.598313 0.278808 0.404869 0.457925 0.419697 0.337316 0.489936 0.639136 0.587029
TUESDAY_REVENUE -0.402942 0.565668 0.772849 0.262633 0.470973 -0.030795 -0.176556 -0.130015 0.403057 0.750212 ... 0.675070 0.642869 0.271883 0.447657 0.490323 0.437322 0.365252 0.632775 0.654975 0.534896
WEDNESDAY_REVENUE -0.416220 0.595310 0.768219 0.257205 0.505723 -0.036036 -0.171092 -0.153867 0.422422 0.464397 ... 0.674560 0.637507 0.300025 0.470638 0.515404 0.454811 0.320913 0.610467 0.663820 0.556319
THURSDAY_REVENUE -0.370538 0.559937 0.720234 0.254330 0.489851 -0.010615 -0.219687 -0.038628 0.373843 0.413712 ... 0.607707 0.594011 0.303015 0.451146 0.479510 0.415017 0.355706 0.597961 0.610250 0.483024
FRIDAY_REVENUE -0.442155 0.596513 0.721811 0.224710 0.531873 -0.003912 -0.192580 -0.149907 0.375930 0.428953 ... 0.593177 0.603498 0.310919 0.495785 0.520322 0.421757 0.374992 0.603543 0.597661 0.486365
SATURDAY_REVENUE -0.414425 0.577960 0.672506 0.244077 0.462722 -0.056544 -0.190325 -0.154860 0.360069 0.379650 ... 0.546354 0.580383 0.285694 0.451039 0.490273 0.461371 0.332262 0.521305 0.553988 0.507392
SUNDAY_REVENUE -0.378438 0.467494 0.657808 0.356891 0.383538 -0.077703 -0.189026 -0.233374 0.311378 0.303502 ... 0.584832 0.583116 0.206984 0.340666 0.396799 0.409164 0.305369 0.423818 0.555155 0.582086
WEEK1_DAY01_DAY07_ORDERS -0.451132 0.809635 0.601025 -0.093441 0.717968 0.006628 -0.292334 -0.097728 0.610258 0.606868 ... 0.423593 0.450737 0.412616 0.652578 0.702307 0.602075 0.309972 0.461281 0.513745 0.432380
WEEK2_DAY08_DAY15_ORDERS -0.530644 0.858859 0.693297 -0.028968 0.749608 -0.007318 -0.246650 -0.239091 0.638693 0.671527 ... 0.518090 0.505239 0.431152 0.702725 0.727832 0.649956 0.337609 0.550625 0.578176 0.503615
WEEK3_DAY16_DAY23_ORDERS -0.540388 0.867246 0.678199 -0.044759 0.762518 -0.000608 -0.251000 -0.246593 0.645955 0.672533 ... 0.750503 0.514279 0.430737 0.703152 0.745516 0.653454 0.332205 0.522132 0.574688 0.499447
WEEK4_DAY24_DAY31_ORDERS -0.531401 0.852617 0.640428 -0.059724 0.738734 -0.022932 -0.272608 -0.266361 0.618019 0.644982 ... 0.475042 0.758797 0.415341 0.668946 0.743956 0.657083 0.309816 0.476863 0.553992 0.478313
WEEK1_DAY01_DAY07_REVENUE -0.428908 0.621523 0.803233 0.278793 0.544678 -0.015323 -0.230810 -0.079601 0.486900 0.502245 ... 0.581709 0.589478 0.324960 0.489872 0.542678 0.465757 0.406857 0.610514 0.704966 0.567592
WEEK2_DAY08_DAY15_REVENUE -0.466474 0.642442 0.828383 0.296669 0.539812 -0.042509 -0.198697 -0.173554 0.488143 0.522820 ... 0.610522 0.568112 0.322660 0.518938 0.539273 0.498674 0.375965 0.668898 0.676586 0.617719
WEEK3_DAY16_DAY23_REVENUE -0.463766 0.644590 0.857369 0.305416 0.552647 -0.036116 -0.197848 -0.182017 0.491319 0.529672 ... 1.000000 0.634763 0.316274 0.510481 0.552063 0.502240 0.389918 0.658998 0.725681 0.646828
WEEK4_DAY24_DAY31_REVENUE -0.491788 0.663521 0.841502 0.336919 0.549227 -0.065299 -0.236006 -0.209898 0.503856 0.524633 ... 0.634763 1.000000 0.330197 0.502446 0.580545 0.524907 0.412556 0.623676 0.723194 0.634260
TIME_0000_0600_ORDERS -0.294194 0.498192 0.387685 -0.035982 0.464812 0.028585 -0.168915 -0.123382 0.356378 0.348824 ... 0.316274 0.330197 1.000000 0.326479 0.278594 0.262490 0.813090 0.235849 0.214736 0.197330
TIME_0601_1200_ORDERS -0.479409 0.804704 0.606809 -0.064526 0.712443 0.004779 -0.251694 -0.214157 0.597853 0.621192 ... 0.510481 0.502446 0.326479 1.000000 0.606739 0.378134 0.235058 0.747176 0.446647 0.250428
TIME_1200_1800_ORDERS -0.525385 0.861575 0.664253 -0.052744 0.765135 -0.000934 -0.268035 -0.226500 0.638433 0.662220 ... 0.552063 0.580545 0.278594 0.606739 1.000000 0.528668 0.200621 0.447236 0.768101 0.390811
TIME_1801_2359_ORDERS -0.472961 0.756866 0.598595 -0.039114 0.629639 -0.039176 -0.229511 -0.181582 0.565423 0.589753 ... 0.502240 0.524907 0.262490 0.378134 0.528668 1.000000 0.204670 0.272882 0.409818 0.785251
TIME_0000_0600_REVENUE -0.265095 0.380336 0.474491 0.156969 0.346697 0.009738 -0.138387 -0.089109 0.281904 0.282089 ... 0.389918 0.412556 0.813090 0.235058 0.200621 0.204670 1.000000 0.271433 0.261625 0.258929
TIME_0601_1200_REVENUE -0.408657 0.593952 0.769682 0.260563 0.502783 -0.031486 -0.194264 -0.152151 0.441408 0.488838 ... 0.658998 0.623676 0.235849 0.747176 0.447236 0.272882 0.271433 1.000000 0.537054 0.305957
TIME_1200_1800_REVENUE -0.477957 0.656071 0.848650 0.320375 0.571861 -0.031684 -0.221773 -0.177786 0.500867 0.540816 ... 0.725681 0.723194 0.214736 0.446647 0.768101 0.409818 0.261625 0.537054 1.000000 0.513174
TIME_1801_2359_REVENUE -0.427217 0.565907 0.742100 0.291733 0.455406 -0.068210 -0.185189 -0.138213 0.452333 0.448681 ... 0.646828 0.634260 0.197330 0.250428 0.390811 0.785251 0.258929 0.305957 0.513174 1.000000

38 rows × 38 columns

In [9]:
#Histogram using Pandas
data.hist(figsize=(16,8))
plt.show()
In [10]:
# group data by total order and plot count plot
data.groupby('TOTAL_ORDERS').count().plot(kind='bar',figsize=(20,8))
#plt.legend(loc=10)
plt.show()
In [11]:
#line plot for REVENUE and TOTAL_ORDERS columns
plt.figure(figsize=(20,6))
sns.lineplot(x='TOTAL_ORDERS',y='REVENUE',data=data)
plt.show()
There is Hike in Revenue after the total order is 40
In [12]:
col = ["REVENUE","TOTAL_ORDERS"]
data[col].describe()
Out[12]:
REVENUE TOTAL_ORDERS
count 5000.000000 5000.00000
mean 1681.523840 12.87040
std 1998.618678 12.67988
min 38.500000 1.00000
25% 315.097500 3.00000
50% 966.725000 8.00000
75% 2493.072500 20.00000
max 34847.400000 156.00000
WE CAN CLASSIFY CUSTOMERS OF THE BASIS OF AVERAGE REVENUE i.e. CUSTOMER WITH GREATER THAT AVG REVENUE CONSIDER TO BE GOOD AND BELOW AVG REVENUE IS CONSIDER TO BE "NEED ATTENTION" CUSTOMER.

RFM MODEL

RFM stands for recency, frequency, monetary value. In business analytics, we often use this concept to divide customers into different segments, like high-value customers, medium value customers or low-value customers, and similarly many others. Recency: How recently has the customer made a transaction with us Frequency: How frequent is the customer in ordering/buying some product from us Monetary: How much does the customer spend on purchasing products from us. CUSTOMER SEGMENTATION: -Champions Customers: Bought recently, buy often and spend the most => recency-latest | frequency-high | momentary-high -Potential Customers: Recent customers with average frequency | recency-latest | frequency-avg -Need attention: Customer have => recency-low | frequency-low | low-monetary

Data Modelling

In [13]:
rfm_data = data[['CustomerID','DAYSSINCELASTORDER','AVGDAYSBETWEENORDERS','REVENUE']].rename\
            ({'DAYSSINCELASTORDER':'RECENCY','AVGDAYSBETWEENORDERS':'FREQUENCY','REVENUE':'MONETARY'},axis=1)
In [14]:
rfm_data
Out[14]:
CustomerID RECENCY FREQUENCY MONETARY
0 22 1 14.19 11986.54
1 29 1 15.89 11025.96
2 83 1 33.12 7259.69
3 95 1 21.11 6992.27
4 124 1 6.65 6263.44
... ... ... ... ...
4995 173946 207 0.00 117.49
4996 173987 207 0.00 117.49
4997 174004 207 0.00 117.49
4998 174038 207 0.00 117.49
4999 200783 207 1235.50 94.14

5000 rows × 4 columns

In [15]:
#Descriptive Statistics (Recency)
rfm_data.RECENCY.describe()
Out[15]:
count    5000.000000
mean       87.420000
std        80.156513
min         1.000000
25%         7.000000
50%        68.000000
75%       171.250000
max       207.000000
Name: RECENCY, dtype: float64
In [16]:
#Recency distribution plot
import seaborn as sns
x = rfm_data['RECENCY']
ax = sns.distplot(x)
plt.show()
C:\Users\acer\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Data shows that Low Recency and high Recency Cutomers are More than Mid Range SO, we have to focused on Mid-Range Section if we want to increase the rate of Recency
In [17]:
#Descriptive Statistics (Frequency)
rfm_data.FREQUENCY.describe()
Out[17]:
count    5000.000000
mean      163.159618
std       259.699496
min         0.000000
25%        21.670000
50%        57.635000
75%       170.357500
max      1409.500000
Name: FREQUENCY, dtype: float64
In [18]:
#Frequency distribution plot
import seaborn as sns
x = rfm_data['FREQUENCY']
ax = sns.distplot(x)
plt.show()
C:\Users\acer\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Data shows that high Frequency are very less.
In [19]:
#Descriptive Statistics (Monetary)
rfm_data.MONETARY.describe()
Out[19]:
count     5000.000000
mean      1681.523840
std       1998.618678
min         38.500000
25%        315.097500
50%        966.725000
75%       2493.072500
max      34847.400000
Name: MONETARY, dtype: float64
In [20]:
#Monetary distribution plot
import seaborn as sns
x = rfm_data['MONETARY']
ax = sns.distplot(x)
plt.show()
C:\Users\acer\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Data shows that there is more low Monetary Customers.

SCALING THE RFM DATA

In [21]:
from sklearn.preprocessing import StandardScaler
In [22]:
obj = StandardScaler()
In [23]:
scaled_data = pd.DataFrame(obj.fit_transform(rfm_data.drop('CustomerID',axis=1))).\
rename({0:'RECENCY',1:'FREQUENCY',2:'MONETARY'},axis=1)
scaled_data
Out[23]:
RECENCY FREQUENCY MONETARY
0 -1.078249 -0.573680 5.156585
1 -1.078249 -0.567134 4.675915
2 -1.078249 -0.500781 2.791290
3 -1.078249 -0.547032 2.657474
4 -1.078249 -0.602717 2.292771
... ... ... ...
4995 1.491981 -0.628326 -0.782636
4996 1.491981 -0.628326 -0.782636
4997 1.491981 -0.628326 -0.782636
4998 1.491981 -0.628326 -0.782636
4999 1.491981 4.129571 -0.794320

5000 rows × 3 columns

In [24]:
final_data = pd.concat([data['CustomerID'],scaled_data],axis=1)
final_data
Out[24]:
CustomerID RECENCY FREQUENCY MONETARY
0 22 -1.078249 -0.573680 5.156585
1 29 -1.078249 -0.567134 4.675915
2 83 -1.078249 -0.500781 2.791290
3 95 -1.078249 -0.547032 2.657474
4 124 -1.078249 -0.602717 2.292771
... ... ... ... ...
4995 173946 1.491981 -0.628326 -0.782636
4996 173987 1.491981 -0.628326 -0.782636
4997 174004 1.491981 -0.628326 -0.782636
4998 174038 1.491981 -0.628326 -0.782636
4999 200783 1.491981 4.129571 -0.794320

5000 rows × 4 columns

In [25]:
#Split into four segments using quantiles
quantiles = final_data.quantile(q=[0.25,0.5,0.75])
quantiles = quantiles.to_dict()
In [26]:
quantiles
Out[26]:
{'CustomerID': {0.25: 1687.5, 0.5: 13765.0, 0.75: 71891.5},
 'RECENCY': {0.25: -1.0033874973440888,
  0.5: -0.24230023872696102,
  0.75: 1.0459335227848168},
 'FREQUENCY': {0.25: -0.544875022998473,
  0.5: -0.4063741882436568,
  0.75: 0.02771896748134752},
 'MONETARY': {0.25: -0.6837537436444985,
  0.5: -0.3576822024689197,
  0.75: 0.40609538778700394}}
In [27]:
#Functions to create R, F and M segments
def RScoring(x,p,d):
    if x <= d[p][0.25]:
        return 1
    elif x <= d[p][0.50]:
        return 2
    elif x <= d[p][0.75]: 
        return 3
    else:
        return 4
    
def FnMScoring(x,p,d):
    if x <= d[p][0.25]:
        return 4
    elif x <= d[p][0.50]:
        return 3
    elif x <= d[p][0.75]: 
        return 2
    else:
        return 1
In [28]:
#Calculate Add R, F and M segment value columns in the existing dataset to show R, F and M segment values
final_data['R'] = final_data['RECENCY'].apply(RScoring, args=('RECENCY',quantiles,))
final_data['F'] = final_data['FREQUENCY'].apply(FnMScoring, args=('FREQUENCY',quantiles,))
final_data['M'] = final_data['MONETARY'].apply(FnMScoring, args=('MONETARY',quantiles,))
final_data.head()
Out[28]:
CustomerID RECENCY FREQUENCY MONETARY R F M
0 22 -1.078249 -0.573680 5.156585 1 4 1
1 29 -1.078249 -0.567134 4.675915 1 4 1
2 83 -1.078249 -0.500781 2.791290 1 3 1
3 95 -1.078249 -0.547032 2.657474 1 4 1
4 124 -1.078249 -0.602717 2.292771 1 4 1
In [29]:
#Calculate and Add RFMGroup value column showing combined concatenated score of RFM
final_data['RFMGroup'] = final_data.R.map(str) + final_data.F.map(str) + final_data.M.map(str)

#Calculate and Add RFMScore value column showing total sum of RFMGroup values
final_data['RFMScore'] = final_data[['R', 'F', 'M']].sum(axis = 1)
final_data.head()
Out[29]:
CustomerID RECENCY FREQUENCY MONETARY R F M RFMGroup RFMScore
0 22 -1.078249 -0.573680 5.156585 1 4 1 141 6
1 29 -1.078249 -0.567134 4.675915 1 4 1 141 6
2 83 -1.078249 -0.500781 2.791290 1 3 1 131 5
3 95 -1.078249 -0.547032 2.657474 1 4 1 141 6
4 124 -1.078249 -0.602717 2.292771 1 4 1 141 6
In [30]:
#Assign Loyalty Level to each customer
Loyalty_Level = ['need attention', 'Potential customers','champions']
Score_cuts = pd.qcut(final_data.RFMScore, q = 3, labels = Loyalty_Level)
final_data['RFM_Loyalty_Level'] = Score_cuts.values

Final Data for our Model Building Process :

In [31]:
final_data.head()
Out[31]:
CustomerID RECENCY FREQUENCY MONETARY R F M RFMGroup RFMScore RFM_Loyalty_Level
0 22 -1.078249 -0.573680 5.156585 1 4 1 141 6 need attention
1 29 -1.078249 -0.567134 4.675915 1 4 1 141 6 need attention
2 83 -1.078249 -0.500781 2.791290 1 3 1 131 5 need attention
3 95 -1.078249 -0.547032 2.657474 1 4 1 141 6 need attention
4 124 -1.078249 -0.602717 2.292771 1 4 1 141 6 need attention
In [32]:
import plotly.express as px
df = px.data.iris()
fig = px.scatter_3d(final_data, x='R', y='F', z='M',color='RFM_Loyalty_Level')
fig.show()
Clearly, We can see that the Data is seperable by lines so it's better to apply Logistic Regression
In [33]:
plt.figure(figsize=(10,6))
sns.scatterplot(y='RFMScore',x='CustomerID',data=final_data,hue='RFM_Loyalty_Level')
Out[33]:
<AxesSubplot:xlabel='CustomerID', ylabel='RFMScore'>

IMPLEMENT LOGISTIC MODEL TO OUR FINAL DATA

In [34]:
from sklearn.linear_model import LogisticRegression
In [35]:
log = LogisticRegression(multi_class='multinomial')
In [36]:
log_data = final_data[['R','F','M','RFM_Loyalty_Level']]

Data of our Need

In [37]:
log_data
Out[37]:
R F M RFM_Loyalty_Level
0 1 4 1 need attention
1 1 4 1 need attention
2 1 3 1 need attention
3 1 4 1 need attention
4 1 4 1 need attention
... ... ... ... ...
4995 4 4 4 champions
4996 4 4 4 champions
4997 4 4 4 champions
4998 4 4 4 champions
4999 4 1 4 champions

5000 rows × 4 columns

In [38]:
#Labelling the Output Column
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
log_data['RFM_Lotalty_Level'] = encoder.fit_transform(log_data['RFM_Loyalty_Level'])
C:\Users\acer\AppData\Local\Temp\ipykernel_12980\1063727757.py:4: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [39]:
X = log_data.iloc[:,0:3]
X
Out[39]:
R F M
0 1 4 1
1 1 4 1
2 1 3 1
3 1 4 1
4 1 4 1
... ... ... ...
4995 4 4 4
4996 4 4 4
4997 4 4 4
4998 4 4 4
4999 4 1 4

5000 rows × 3 columns

In [40]:
y = log_data.iloc[:,-1]
y
Out[40]:
0       2
1       2
2       2
3       2
4       2
       ..
4995    1
4996    1
4997    1
4998    1
4999    1
Name: RFM_Lotalty_Level, Length: 5000, dtype: int32
In [41]:
log.fit(X,y)
Out[41]:
LogisticRegression(multi_class='multinomial')
In [42]:
sample = log_data.sample(100)
x1 = sample.iloc[:,0:3]
y1 = sample.iloc[:,-1]
In [43]:
y_pred = log.predict(x1)
y_pred
Out[43]:
array([0, 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 2, 1, 1, 0, 0, 0, 2, 1, 2, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 2, 0, 2, 1, 2, 0, 0,
       0, 0, 0, 2, 2, 1, 0, 0, 2, 0, 1, 0, 2, 2, 2, 2, 0, 0, 1, 0, 2, 0,
       1, 2, 0, 0, 2, 1, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 2, 1, 2,
       0, 0, 2, 2, 2, 1, 2, 0, 2, 1, 1, 2])

Accuracy

In [44]:
from sklearn.metrics import accuracy_score,confusion_matrix
print("Accuracy of Logistic Regression is",accuracy_score(y1,y_pred)*100,"%")
Accuracy of Logistic Regression is 100.0 %
In [45]:
print("Logistic Regression Confusion Matrix\n")
pd.DataFrame(confusion_matrix(y1,y_pred),columns=list(range(0,3)))
Logistic Regression Confusion Matrix

Out[45]:
0 1 2
0 49 0 0
1 0 15 0
2 0 0 36
In [46]:
from mlxtend.plotting import plot_decision_regions

plot_decision_regions(X.values, y.values, log, legend=2,filler_feature_values = {2:3})

plt.show()
C:\Users\acer\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning:

X does not have valid feature names, but LogisticRegression was fitted with feature names